library(ggplot2)
library(phyloseq)
library(randomForest)
library(tidyverse)
library(ggpubr)
library(microbiome)
library(dysbiosisR)
library(ggvenn)
library(NBZIMM)
library(ZIBR)
library(reshape)
library(RColorBrewer)
ps_data <- readRDS("data/ps_data")
#remove Caucasian subjects
ps_data  <- subset_samples(ps_data, Race_ID == "African.American")

# Extract sample data as a data frame
sample_df <- as_data_frame(sample_data(ps_data))
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` (with slightly different semantics) to convert to a
##   tibble, or `as.data.frame()` to convert to a data frame.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Add Age_Cat2
sample_df <- sample_df %>%
  mutate(Age = as.numeric(as.character(Age))) %>%
  mutate(Age_Cat2 = case_when(
    Age < 40 ~ "Less than 40", 
    Age >= 40 & Age < 60 ~ "40 to 59", 
    Age >= 60 ~ "60+"
  ))

sample_df <- as.data.frame(sample_df)

#Preserve original rownames (sample names)
rownames(sample_df) <- rownames(ps_data@sam_data)


identical(colnames(ps_data@otu_table), rownames(sample_df))
## [1] TRUE
# add back to physeq
sample_data(ps_data) <- sample_data(sample_df)

#convert phylseq object into a dataframe 
ps_data.df <- psmelt(ps_data)

#cleanup Genus names
ps_data.df$Genus <- sub("g__", "", ps_data.df$Genus)

#cleanup Phyla names
ps_data.df$Phylum <- sub("p__", "", ps_data.df$Phylum)

#transform counts to relative abundances
ps_rel <- transform_sample_counts(ps_data, function(OTU) OTU/sum(OTU))

#convert the relative abundance data into a dataframe
ps_rel.df <- psmelt(ps_rel)

#cleanup Genus names
ps_rel.df$Genus <- sub("g__", "", ps_rel.df$Genus)
#set the theme for all plots
theme_set(theme_bw()) 

#a color palette for use in plotting
palette <- c("lightpink", "cornflowerblue", "yellow2", "darkblue", "darkorange1", "deeppink", "darkviolet", "palegreen",  "springgreen4", "black", "goldenrod", "red", "blue", "#3DB7E9",  "#D55E00", "#F0E442",'#253494', "gray50", "purple", "orange", "lightgreen", "forestgreen") 

Explore metadata

df.meta <- as(sample_data(ps_data), "data.frame") 
df.meta.unique <- df.meta[!duplicated(df.meta$Person_ID), ]

df.meta.unique %>% count(Hair_Type)
##           Hair_Type  n
## 1             Color  5
## 2 Keratin.Treatment  1
## 3           Natural 20
## 4           Relaxed  8
## 5              <NA>  2
df.meta.unique %>% count(Race_ID)
##            Race_ID  n
## 1 African.American 36
df.meta.unique %>% count(Gender)
##   Gender  n
## 1 Female 29
## 2   Male  7
df.meta.unique %>% count(Cleansing_Freq)
##   Cleansing_Freq  n
## 1       Biweekly 16
## 2          Daily  1
## 3        Monthly  7
## 4         Weekly 12
range(df.meta.unique$Age)
## [1] 23 73
#check the number of participants 
length(unique(df.meta$Person_ID))
## [1] 36
#check the number of samples per participants
df.meta %>% 
  count(Person_ID) %>% 
  arrange(desc(n))
##    Person_ID n
## 1  Person.02 2
## 2  Person.03 2
## 3  Person.05 2
## 4  Person.07 2
## 5  Person.09 2
## 6  Person.10 2
## 7  Person.11 2
## 8  Person.12 2
## 9  Person.13 2
## 10 Person.14 2
## 11 Person.15 2
## 12 Person.17 2
## 13 Person.18 2
## 14 Person.19 2
## 15 Person.22 2
## 16 Person.23 2
## 17 Person.24 2
## 18 Person.25 2
## 19 Person.26 2
## 20 Person.29 2
## 21 Person.32 2
## 22 Person.33 2
## 23 Person.35 2
## 24 Person.36 2
## 25 Person.37 2
## 26 Person.40 2
## 27 Person.42 2
## 28 Person.44 2
## 29 Person.46 2
## 30 Person.47 2
## 31 Person.49 2
## 32 Person.06 1
## 33 Person.16 1
## 34 Person.27 1
## 35 Person.41 1
## 36 Person.45 1
#check the number of samples per hair site type
df.meta %>% 
  count(Area_Cond)
##   Area_Cond  n
## 1 Afflicted 35
## 2    Normal 32
#check the age ranges 

range(df.meta$Age)
## [1] 23 73
# Density
ggplot(df.meta.unique, aes(x = Age, y = Density_Scale)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic() +
  stat_cor() +
  labs(x = "Age (years)",
       y = "Scalp Density") +
 theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(), 
        axis.text.x = element_text(family = "Arial", size = 12),
        axis.text.y = element_text(family = "Arial", size = 12))+
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

#  stress
ggplot(df.meta.unique, aes(x = Age, y = Stress_Scale)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_classic() +
  stat_cor() +
  labs(x = "Age (years)",
       y = "Scalp Stress") +
  theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(), 
        axis.text.x = element_text(family = "Arial", size = 12),
        axis.text.y = element_text(family = "Arial", size = 12))+
  theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

#check the number of samples per participants
df.meta %>% 
  count(Person_ID, X.SampleID)
##    Person_ID X.SampleID n
## 1  Person.02      140GP 1
## 2  Person.02      143GP 1
## 3  Person.03      098GP 1
## 4  Person.03      099GP 1
## 5  Person.05      213GP 1
## 6  Person.05      240GP 1
## 7  Person.06      234GP 1
## 8  Person.07      080GP 1
## 9  Person.07      084GP 1
## 10 Person.09      238GP 1
## 11 Person.09      239GP 1
## 12 Person.10      231GP 1
## 13 Person.10      232GP 1
## 14 Person.11      183GP 1
## 15 Person.11      188GP 1
## 16 Person.12      070GP 1
## 17 Person.12      078GP 1
## 18 Person.13      062GP 1
## 19 Person.13      072GP 1
## 20 Person.14      211GP 1
## 21 Person.14      216GP 1
## 22 Person.15      008GP 1
## 23 Person.15      009GP 1
## 24 Person.16      174GP 1
## 25 Person.17      064GP 1
## 26 Person.17      068GP 1
## 27 Person.18      142GP 1
## 28 Person.18      148GP 1
## 29 Person.19      173GP 1
## 30 Person.19      175GP 1
## 31 Person.22      090GP 1
## 32 Person.22      091GP 1
## 33 Person.23      144GP 1
## 34 Person.23      146GP 1
## 35 Person.24      243GP 1
## 36 Person.24      244GP 1
## 37 Person.25      092GP 1
## 38 Person.25      093GP 1
## 39 Person.26      104GP 1
## 40 Person.26      107GP 1
## 41 Person.27      241GP 1
## 42 Person.29      082GP 1
## 43 Person.29      086GP 1
## 44 Person.32      200GP 1
## 45 Person.32      202GP 1
## 46 Person.33      102GP 1
## 47 Person.33      109GP 1
## 48 Person.35      118GP 1
## 49 Person.35      119GP 1
## 50 Person.36      246GP 1
## 51 Person.36      247GP 1
## 52 Person.37      248GP 1
## 53 Person.37      249GP 1
## 54 Person.40      236GP 1
## 55 Person.40      237GP 1
## 56 Person.41      060GP 1
## 57 Person.42      201GP 1
## 58 Person.42      203GP 1
## 59 Person.44      096GP 1
## 60 Person.44      097GP 1
## 61 Person.45      204GP 1
## 62 Person.46      209GP 1
## 63 Person.46      210GP 1
## 64 Person.47      094GP 1
## 65 Person.47      095GP 1
## 66 Person.49      207GP 1
## 67 Person.49      208GP 1

Explore taxonomy

#total number of sequences
sum(ps_data.df$Abundance)
## [1] 1720130
#percentage of each phylum in descending order
percent_phyla <- ps_data.df %>% 
  group_by(Phylum) %>% 
  summarize(sum = sum(Abundance/1720130)*100) %>% 
  arrange(desc(sum))
percent_phyla 
## # A tibble: 38 × 2
##    Phylum              sum
##    <chr>             <dbl>
##  1 Firmicutes      50.6   
##  2 Actinobacteria  33.0   
##  3 Proteobacteria  11.6   
##  4 Cyanobacteria    3.15  
##  5 Bacteroidetes    1.14  
##  6 Fusobacteria     0.283 
##  7 Verrucomicrobia  0.0400
##  8 Acidobacteria    0.0292
##  9 <NA>             0.0246
## 10 Planctomycetes   0.0221
## # ℹ 28 more rows
#percentage of each genus in descending order
percent_genera <- ps_data.df %>% 
  group_by(Genus) %>% 
  summarize(sum = sum(Abundance/1720130)*100) %>% 
  arrange(desc(sum))
percent_genera
## # A tibble: 380 × 2
##    Genus                 sum
##    <chr>               <dbl>
##  1 "Staphylococcus"    41.8 
##  2 "Corynebacterium"   26.8 
##  3 ""                   7.69
##  4 "Propionibacterium"  4.04
##  5 "Acinetobacter"      3.08
##  6 "Streptococcus"      2.80
##  7 "Anaerococcus"       1.77
##  8 "Enhydrobacter"      1.52
##  9 "Burkholderia"       1.18
## 10  <NA>                1.16
## # ℹ 370 more rows
ps_data.df %>% 
  filter(Genus == c("Staphylococcus", "Corynebacterium", "Propionibacterium")) %>% 
  group_by(Genus, Area_Cond) %>% 
  summarize(average = sum(Abundance/1720130)*100)
## `summarise()` has grouped output by 'Genus'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
## # Groups:   Genus [3]
##   Genus             Area_Cond average
##   <chr>             <chr>       <dbl>
## 1 Corynebacterium   Afflicted   4.24 
## 2 Corynebacterium   Normal      5.42 
## 3 Propionibacterium Afflicted   0.781
## 4 Propionibacterium Normal      0.513
## 5 Staphylococcus    Afflicted   5.88 
## 6 Staphylococcus    Normal      9.64
ps_data.df %>% 
  filter(Genus == c("Staphylococcus", "Corynebacterium", "Propionibacterium")) %>% 
  group_by(Genus, Area_Cond) %>% 
  summarize(average = sum(Abundance/1720130)*100) %>% 
  ggplot(aes(Area_Cond, average, fill = Genus))+
  geom_bar(stat="identity")+
    coord_flip()+
    theme_bw()+
  scale_fill_viridis_d()+
  labs(x = "", y = "Average Relative Abundance (%)")
## `summarise()` has grouped output by 'Genus'. You can override using the
## `.groups` argument.

plot_df <- ps_rel.df %>% 
  mutate(Genus = ifelse(Abundance < 0.05, " Other < 5%", Genus),
         Genus = ifelse(is.na(Genus) | Genus == "", "Unclassified", Genus)) %>%
  group_by(Sample, Area_Cond, Genus) %>% 
  summarize(Abundance = sum(Abundance), .groups = "drop") %>%
  group_by(Area_Cond, Genus) %>%
  summarize(mean_abundance = mean(Abundance) * 100,
    sd_abundance = sd(Abundance) * 100,
    .groups = "drop")

ggplot(plot_df, aes(x = Genus, y = mean_abundance, fill = Area_Cond)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9), alpha=0.7) +
  geom_errorbar(aes(ymin = mean_abundance - sd_abundance, ymax = mean_abundance + sd_abundance),
                width = 0.3, position = position_dodge(width = 0.9)) +
  theme_bw() +
  labs(x = "", y = "Average Relative Abundance (%)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), 
        legend.title = element_blank()) +
  stat_compare_means(
    method = "wilcox",
    paired = TRUE,
    hide.ns = TRUE,
    label = "p.signif") +
  scale_fill_manual(values = c("red", "black"))+
  coord_flip()

ggsave("figures/Supplemental_AverageRelativeAbd_Genus.pdf")
## Saving 7 x 5 in image

Alpha diversity

#generate alpha diversity metrics 
alpha <- microbiome::alpha(ps_data, index = "all")
## Observed richness
## Other forms of richness
## Diversity
## Evenness
## Dominance
## Rarity
#assign sample names to the new data 
alpha$X.SampleID <- rownames(alpha)

# extract sample data from the phyloseq object
df <- as(sample_data(ps_data), "data.frame")

# Merge Tables
alpha.table <- merge(alpha, df, by = "X.SampleID")
#Shannon diversity
 ggplot(alpha.table, aes(x = Area_Cond, y = diversity_shannon, fill = Area_Cond)) +
  geom_boxplot() +
    stat_compare_means(method="wilcox") +
  labs(x = "Skin Site",
       y = "Shannon") +
    theme(axis.line.y = element_blank(),
          axis.ticks.y = element_blank(), 
          legend.title = element_blank())

#Number of OTUs
 ggplot(alpha.table, aes(x = Area_Cond, y = observed, fill = Area_Cond)) +
  geom_boxplot() +
    stat_compare_means() +
  labs(x = "Skin Site",
       y = "Oberserved OTUs") +
    theme(axis.line.y = element_blank(),
          axis.ticks.y = element_blank())

 #Calculate average number of OTUs per group
 alpha.table %>% 
   group_by(Area_Cond) %>% 
   summarise(mean(observed))
## # A tibble: 2 × 2
##   Area_Cond `mean(observed)`
##   <chr>                <dbl>
## 1 Afflicted             363.
## 2 Normal                297.
#Shannon vs Age
ggplot(alpha.table,
       aes(x = Age, y = diversity_shannon, fill = Area_Cond, color = Area_Cond)) +
    geom_point(aes(color = Area_Cond)) +
    geom_smooth(method = "lm", fill = "gray", guide = "none") +
    stat_cor() +
    labs(x = "Age", y = "Shannon Diversity", title = "A") +
    scale_color_manual(values = c("red", "black"))+
    guides(color = guide_legend(override.aes = list(shape = 16, size = 3, fill = NA)))+
    theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(), 
        axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12), 
        legend.text = element_text(size = 10), 
        legend.title = element_blank())
## Warning in geom_smooth(method = "lm", fill = "gray", guide = "none"): Ignoring
## unknown parameters: `guide`
## `geom_smooth()` using formula = 'y ~ x'

ggsave("figures/Fig1a.pdf", height = 4, width = 5)
## `geom_smooth()` using formula = 'y ~ x'
alpha.table %>% 
  mutate(Age_Cat2 = case_when(Age < 40 ~ "1) Less than 40", 
                              Age < 60 & Age > 39  ~ "2) 40 to 59", 
                              Age > 59 ~ "3) 60+")) %>% 
  ggplot(aes(x = Area_Cond, y = diversity_shannon)) +
  geom_boxplot(aes(color = Area_Cond), show.legend = F) +
  geom_point(aes(color = Area_Cond))+
  stat_compare_means(method="wilcox") +
  stat_cor()+
  scale_color_manual(values = c("red", "black"))+
  facet_wrap(~Age_Cat2)+
  guides(color = guide_legend(override.aes = list(shape = 16, size = 3, fill = NA)))+
  labs(x = " ", y = "Shannon Diversity", title = "B") +
      theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(), 
        axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        legend.position = "none")

ggsave("figures/Fig1b.pdf", height = 4, width = 5)
alpha.table %>% 
  filter(Area_Cond == "Afflicted") %>% 
  ggplot(aes(x = Cleansing_Freq, y = diversity_shannon, fill = Cleansing_Freq)) +
  geom_boxplot() +
    stat_compare_means(label = "p.signif") +
  geom_point()+
  labs(x = " ",
       y = "Shannon Diversity") +
      theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(), 
        axis.text.x = element_text(family = "Arial", size = 12),
        axis.text.y = element_text(family = "Arial", size = 12), 
        legend.text = element_text(family = "Arial", size = 10))

alpha.table %>% 
  filter(Area_Cond == "Afflicted") %>%
  mutate(Type = ifelse(Hair_Type == "Natural", "Natural",  "Other")) %>%
  filter(Type != is.na(Type)) %>% 
  ggplot(aes(x = Type, y = diversity_shannon, fill = Type)) +
  geom_boxplot() +
    stat_compare_means(label = "p.signif") +
  geom_point()+
  labs(x = " ",
       y = "Shannon Diversity") +
      theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(), 
        axis.text.x = element_text(family = "Arial", size = 12),
        axis.text.y = element_text(family = "Arial", size = 12), 
        legend.text = element_text(family = "Arial", size = 10))

alpha.table %>% 
  filter(Area_Cond == "Afflicted") %>% 
  ggplot(aes(x = Hair_Type, y = diversity_shannon, fill = Hair_Type)) +
  geom_boxplot() +
    stat_compare_means(label = "p.signif") +
  geom_point()+
  labs(x = " ",
       y = "Shannon Diversity") +
      theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(), 
        axis.text.x = element_text(family = "Arial", size = 12),
        axis.text.y = element_text(family = "Arial", size = 12), 
        legend.text = element_text(family = "Arial", size = 10))

alpha.table %>% 
  filter(Area_Cond == "Afflicted") %>% 
  ggplot(aes(x = Gender, y = diversity_shannon, fill = Gender)) +
  geom_boxplot() +
    stat_compare_means(label = "p.signif") +
  geom_point()+
  labs(x = " ",
       y = "Shannon Diversity") +
      theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(), 
        axis.text.x = element_text(family = "Arial", size = 12),
        axis.text.y = element_text(family = "Arial", size = 12), 
        legend.text = element_text(family = "Arial", size = 10))

Beta Diversity

#CLR before Euclidean distance for Aitchison distances
#Center log ratio the phyloseq file
alo_clr <- microbiome::transform(ps_data, "clr")
#PCA via phyloseq
ord_clr <- phyloseq::ordinate(alo_clr, "RDA") 

#ord_clr <- phyloseq::ordinate(alo_clr, "PCoA") 

plot_ordination(alo_clr, ord_clr, color = "Area_Cond")

#Plot scree plot
phyloseq::plot_scree(ord_clr) +
  geom_bar(stat="identity", fill = "blue") +
  labs(x = "Axis", y = "Proportion of Variance")

#Examine eigenvalues and % proportion of variance explained
head(ord_clr$CA$eig)
##       PC1       PC2       PC3       PC4       PC5       PC6 
## 201.42564 123.57687 109.34320 100.35991  86.12584  79.91789
#What proportion of variance explained in actual understandable form as a decimal
sapply(ord_clr$CA$eig[1:10], function(x) x / sum(ord_clr$CA$eig))
##        PC1        PC2        PC3        PC4        PC5        PC6        PC7 
## 0.08508721 0.05220195 0.04618930 0.04239453 0.03638170 0.03375931 0.03193174 
##        PC8        PC9       PC10 
## 0.03022906 0.02759772 0.02710738
clr1 <- ord_clr$CA$eig[1] / sum(ord_clr$CA$eig)
clr2 <- ord_clr$CA$eig[2] / sum(ord_clr$CA$eig)


phyloseq::plot_ordination(ps_data, ord_clr, type="samples", color="Area_Cond")+
  geom_point() +
  coord_fixed(clr2 / clr1) +
  stat_ellipse(aes(group = Area_Cond), linetype = 2)+
  scale_color_manual(values = c("red", "black"))+
  theme(legend.title = element_blank(), legend.position = "none")+
  labs(title = "C")

ggsave("figures/Fig1c.pdf", width = 4, height = 6)
ord <- ordinate(alo_clr, method = "PCoA", distance = "euclidean")

# Extract ordination coordinates and merge with metadata
ord_df <- plot_ordination(alo_clr, ord, type = "samples", justDF = TRUE)

# Add metadata: Site condition and Person_ID
ord_df$Area_Cond <- sample_data(alo_clr)$Area_Cond
ord_df$Person_ID <- sample_data(alo_clr)$Person_ID

# Plot with lines connecting samples from the same individual
ggplot(ord_df, aes(x = Axis.1, y = Axis.2, color = Area_Cond)) +
  geom_point(size = 3) +
  geom_line(aes(group = Person_ID), color = "gray70", linewidth = 0.5) +
  theme_minimal() +
  labs(title = "PCoA (CLR + Euclidean): Afflicted vs. Normal Sites",
       x = "PCoA Axis 1", y = "PCoA Axis 2",
       color = "Site Type") +
  theme(legend.position = "right")

#Generate distance matrix
clr_dist_matrix <- phyloseq::distance(alo_clr, method = "euclidean")

#ADONIS test for Age -> Also significant!
vegan::adonis2(clr_dist_matrix ~phyloseq::sample_data(alo_clr)$Age_Cat2, permutations = 10000)
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 10000
## 
## vegan::adonis2(formula = clr_dist_matrix ~ phyloseq::sample_data(alo_clr)$Age_Cat2, permutations = 10000)
##          Df SumOfSqs      R2      F    Pr(>F)    
## Model     2     8106 0.05188 1.7511 9.999e-05 ***
## Residual 64   148135 0.94812                     
## Total    66   156241 1.00000                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ADONIS test for Gender -> Also significant!
vegan::adonis2(clr_dist_matrix ~phyloseq::sample_data(alo_clr)$Gender, permutations = 10000)
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 10000
## 
## vegan::adonis2(formula = clr_dist_matrix ~ phyloseq::sample_data(alo_clr)$Gender, permutations = 10000)
##          Df SumOfSqs      R2      F Pr(>F)    
## Model     1     4663 0.02985 1.9998  3e-04 ***
## Residual 65   151577 0.97015                  
## Total    66   156241 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ADONIS test for Normal vs Afflicted
vegan::adonis2(clr_dist_matrix ~phyloseq::sample_data(alo_clr)$Area_Cond, permutations = 10000, strata=sample_df$Person_ID)
## Permutation test for adonis under reduced model
## Blocks:  strata 
## Permutation: free
## Number of permutations: 10000
## 
## vegan::adonis2(formula = clr_dist_matrix ~ phyloseq::sample_data(alo_clr)$Area_Cond, permutations = 10000, strata = sample_df$Person_ID)
##          Df SumOfSqs     R2      F    Pr(>F)    
## Model     1     2954 0.0189 1.2524 9.999e-05 ***
## Residual 65   153287 0.9811                     
## Total    66   156241 1.0000                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ADONIS test for Normal vs Afflicted -> Significant
vegan::adonis2(clr_dist_matrix ~phyloseq::sample_data(alo_clr)$Area_Cond, permutations = 10000)
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 10000
## 
## vegan::adonis2(formula = clr_dist_matrix ~ phyloseq::sample_data(alo_clr)$Area_Cond, permutations = 10000)
##          Df SumOfSqs     R2      F Pr(>F)  
## Model     1     2954 0.0189 1.2524 0.0491 *
## Residual 65   153287 0.9811                
## Total    66   156241 1.0000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Dispersion test for Area_Cond (Normal vs Afflicted)
dispr <- vegan::betadisper(clr_dist_matrix, phyloseq::sample_data(alo_clr)$Area_Cond)
anova(dispr)
## Analysis of Variance Table
## 
## Response: Distances
##           Df Sum Sq Mean Sq F value  Pr(>F)   
## Groups     1 1159.1 1159.13  8.0916 0.00594 **
## Residuals 65 9311.3  143.25                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Dispersion test for age
dispr2 <- vegan::betadisper(clr_dist_matrix, phyloseq::sample_data(alo_clr)$Age_Cat2)
anova(dispr2)
## Analysis of Variance Table
## 
## Response: Distances
##           Df Sum Sq Mean Sq F value   Pr(>F)    
## Groups     2 2926.4 1463.22  12.332 2.95e-05 ***
## Residuals 64 7593.8  118.65                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Dispersion test for Gender
dispr3 <- vegan::betadisper(clr_dist_matrix, phyloseq::sample_data(alo_clr)$Gender)
anova(dispr3)
## Analysis of Variance Table
## 
## Response: Distances
##           Df  Sum Sq Mean Sq F value Pr(>F)
## Groups     1   263.7  263.69  1.6132 0.2086
## Residuals 65 10625.2  163.47
#Dispersion plot for Area_Cond + boxplot
plot(dispr, main = "Ordination Centroids and Dispersion Labeled: Aitchison Distance", sub = "")

boxplot(dispr, main = "", xlab = "")

#Dispersion plot for Age_Cat + boxplot
plot(dispr2, main = "Ordination Centroids and Dispersion Labeled:Aitchison Distance", sub = "")

boxplot(dispr2, main = "", xlab = "")

#Dispersion plot for Gender + boxplot
plot(dispr3, main = "Ordination Centroids and Dispersion Labeled:Aitchison Distance", sub = "")

boxplot(dispr3, main = "", xlab = "")

#Generate distance matrix
clr_dist_matrix <- phyloseq::distance(alo_clr, method = "euclidean")

# Distance matrix to long format
dist_long <- as.data.frame(as.table(as.matrix(clr_dist_matrix)))
colnames(dist_long) <- c("Sample1", "Sample2", "Distance")

#  Add metadata
meta <- sample_df[, c("X.SampleID", "Person_ID", "Area_Cond")]

dist_long <- dist_long %>%
  inner_join(meta, by = c("Sample1" = "X.SampleID")) %>%
  inner_join(meta, by = c("Sample2" = "X.SampleID"),
             suffix = c("_1", "_2"))

#keep a copy of the matrix 
dist_long_1 <- dist_long

# Remove redundant/self comparisons
dist_long <- dist_long %>%
  filter(Sample1 < Sample2)

# Define "Within" vs "Between"
dist_long <- dist_long %>%
  mutate(Comparison = if_else(Person_ID_1 == Person_ID_2, "Within", "Between")) %>% 
  mutate(Condition = Area_Cond_1)

#keep a copy 
dist_long_2 <-dist_long


ggplot(dist_long, aes(x = Comparison, y = Distance, fill = Condition)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  #facet_wrap(~ Condition) +
  labs(title = "Within vs Between-Person Distances by Condition",
       y = "Distance", x = NULL) +
  theme_minimal() +
  scale_fill_manual(values = c("black", "red"))+
  stat_compare_means(method = "wilcox")+
  theme(legend.title = element_blank())

ggsave("figures/varianceWcondition.pdf")
## Saving 7 x 5 in image
ggplot(dist_long, aes(x = Comparison, y = Distance)) +
  geom_boxplot(outlier.shape = NA, alpha = 0.7) +
  #facet_wrap(~ Condition) +
  labs(title = "Within vs Between-Person Distances",
       y = "Distance", x = NULL) +
  theme_minimal() +
  stat_compare_means(method = "wilcox")

ggsave("figures/variance.pdf")
## Saving 7 x 5 in image

Dysbiosis Scores

#center log ratio (clr) transformation
clr <- microbiome::transform(ps_data, "clr")

#make a distance matrix
dist.matrix <- phyloseq::distance(clr, "euclidean") #euclidean distances of clr transformed data are Aitchison distances

#set Normal as the baseline
ref.samples <- sample_names(subset_samples(ps_data, 
                                           Area_Cond == "Normal")) 


#calculate dysbiosis scores
dysbiosis <- euclideanDistCentroids(ps_data,
                                      dist_mat = dist.matrix,
                                      use_squared = F,
                                      group_col = "Area_Cond",
                                      control_label = "Normal",
                                      case_label = "Afflicted")


# order the data 
dysbiosis$Area_Cond <- factor(dysbiosis$Area_Cond, 
                              levels = c("Normal", "Afflicted"))
#AOC plot to check sensitivity and specificity of the model
AUCplot <- pROC::roc(as.factor(dysbiosis$Area_Cond),
                   dysbiosis$CentroidDist_score,
                   #direction= ">",
                   plot=TRUE,
                   ci = TRUE,
                   auc.polygon=TRUE,
                   max.auc.polygon=TRUE,
                   print.auc=TRUE)
## Setting levels: control = Normal, case = Afflicted
## Setting direction: controls < cases

#calculate average dysbiosis score by group
dysbiosis %>% 
  group_by(Area_Cond) %>% 
  summarise(mean(CentroidDist_score))
## # A tibble: 2 × 2
##   Area_Cond `mean(CentroidDist_score)`
##   <fct>                          <dbl>
## 1 Normal                         -2.39
## 2 Afflicted                       1.24
#check to see if there is a signicant different in dysbiosis scores between groups 
ggplot(dysbiosis, aes(Area_Cond, CentroidDist_score)) +
  geom_boxplot()+
  geom_point()+
  stat_compare_means(method = "wilcox.test")+
  theme_bw()

#set a dysbiosis threshold
dysbiosis <- dysbiosis %>% 
  mutate(dysbiotic = case_when(CentroidDist_score > 0 ~ "Dysbiotic", 
                              CentroidDist_score < 0 ~ "Normobiosis" )) 


#check which hair sites got dysbiosis assignments
dysbiosis %>% 
  group_by(dysbiotic, Area_Cond) %>% 
  count(dysbiotic) %>% 
  arrange(desc(n))
## # A tibble: 4 × 3
## # Groups:   dysbiotic, Area_Cond [4]
##   dysbiotic   Area_Cond     n
##   <chr>       <fct>     <int>
## 1 Normobiosis Normal       30
## 2 Dysbiotic   Afflicted    24
## 3 Normobiosis Afflicted    11
## 4 Dysbiotic   Normal        2
plotDysbiosisGradient(df=dysbiosis,
                                  score="CentroidDist_score",
                                  high_line = 0.0,
                                  group_var = "Area_Cond",
                                  group_colors=c("Normal" = "black", 
                                                 "Afflicted"= "brown3"),
                                
                        
                                  bg_colors = rev(brewer.pal(9, "Reds")),
                                  jitter_width = 0.1) +
  labs(y="Dysbiosis Score", subtitle = " ") +
  theme(legend.title = element_blank())+
  # adjust the x and y values to fit to plot
  ggplot2::annotate("text",  x = 0.65, y = .1, label = "Center",color="white")

ggsave("Figures/Fig2.pdf", height =6, width =5)

plotDysbiosisGradient(df = dysbiosis,
                      score = "CentroidDist_score",
                      high_line = 0.0,
                      group_var = "Area_Cond",
                      group_colors = c("Normal" = "black", "Afflicted" = "brown3"),
                      bg_colors = rev(brewer.pal(9, "Reds")),
                      jitter_width = 0) +
  geom_text(data = dysbiosis,
            aes(y = CentroidDist_score, 
                x = 1, # or adjust to match the y-location of points
                label = Person_ID),
            #position = position_jitter(width = 0.1, height = 0),
            size = 3, 
            vjust = -1, 
            check_overlap = TRUE) +
  labs(y = "Dysbiosis Score", subtitle = " ") +
  theme(legend.title = element_blank()) +
  annotate("text", x = 0.65, y = 0.1, label = "Center", color = "white")

#ggsave("figures/labeleddysbiois.pdf")

Mixed Effects Model

ps_data@sam_data$Area_Cond <- as.factor(ps_data@sam_data$Area_Cond)

ps_data@sam_data$Area_Cond <- relevel(ps_data@sam_data$Area_Cond, ref = "Normal")

levels(ps_data@sam_data$Area_Cond)
## [1] "Normal"    "Afflicted"
# Aggregate Table by OTU
df.agg <- stats::aggregate(Abundance ~ OTU + X.SampleID, ps_data.df, FUN = sum)

# Make Blank Cells Unspecified
df.agg$OTU[df.agg$OTU == ""] <- "Unassigned"

# Manipulate Data Table to Short Version
df.agg.otus <- as.data.frame(reshape::cast(df.agg, X.SampleID ~ OTU, sum))
## Using Abundance as value column.  Use the value argument to cast to override this choice
data <- df.agg.otus #preserve data before making a matrix

# Make SampleID Rownames and Remove Column
rownames(df.agg.otus) <- df.agg.otus$X.SampleID
df.agg.otus$X.SampleID <- NULL # to make a matrix

# Check Dimensions
dim(df.agg.otus)
### Create Variables for Model ###
# Sample Data
df.meta <- sample_data(ps_data)

Age <- df.meta$Age

Area_Cond <- df.meta$Area_Cond

# create data frame containing all the variables of interest
df.meta <- as_data_frame(df.meta)

df.meta.var <- df.meta %>%  dplyr::select(X.SampleID, Person_ID, Age, Age_Cat, Gender)

data.var <- left_join(data, df.meta, by = "X.SampleID" )
#mem <- NBZIMM::mms(y = df.agg.otus, fixed = ~ Area_Cond, data = data.var,  random = ~ 1 | Age,  min.p = 0.2, method = "nb")

#saveRDS(mem,  "output/mem")
mem <- readRDS("output/mem")

# Extract data

out <- mem$call$fixed

out = NBZIMM::fixed(mem)$dist
out = out[out[,2]!="(Intercept)", ]
res = out[, 3:5]

#remove the Area_CondAfflicted from the rowname
rownames(res) <- gsub("--Area_CondAfflicted", "", rownames(res)) 

# make a data frame
res.data <- as.data.frame(res)

#add OTU column to the data frame
res.data$OTU <- rownames(res.data)

#filter sigificant results
res.data <- res.data %>% 
  filter(pvalue <0.05)

#join results with all data
res.data.df <- left_join(res.data, ps_data.df, by = "OTU")

#clean up genus column
res.data.df$Genus <- sub("g__", "", res.data.df$Genus)

#number of OTUs that were more abundant in afflicted sites 
res.data %>% filter(Estimate > 0)
##                               Estimate Std.Error       pvalue
## 1000547                      0.9123754 0.3727216 1.873928e-02
## 1001007                      1.0322221 0.3951044 1.250536e-02
## 1015518                      1.2098636 0.4374904 8.482653e-03
## 103411                       1.3047249 0.2841304 4.122552e-05
## 1062356                      0.9123062 0.3466475 1.191626e-02
## 1065569                      0.8742012 0.3266396 1.065055e-02
## 1079708                      1.2304012 0.3526376 1.172251e-03
## 109591                       1.3050612 0.3301427 2.980036e-04
## 1096610                      0.9777314 0.2428713 2.390798e-04
## 130864                       2.5411772 0.2451496 5.057877e-13
## 134726                       0.9503889 0.3709128 1.416904e-02
## 137183                       1.7767134 0.4684931 4.823857e-04
## 159711                       2.5480086 0.3672748 2.018415e-08
## 161677                       3.4444797 0.8135205 1.262391e-04
## 1646259                      0.9771663 0.3179838 3.757922e-03
## 1790396                      0.9031755 0.3597093 1.608074e-02
## 2119418                      1.2021677 0.3029106 2.841902e-04
## 224222                       1.4834588 0.5373076 8.582639e-03
## 245648                       1.0903653 0.3840787 7.011469e-03
## 274365                       1.4454274 0.7082210 4.773126e-02
## 2751958                      1.8342521 0.6823155 1.033336e-02
## 28246                        1.0241342 0.4266782 2.100792e-02
## 30062                        2.0953411 0.4024426 5.757770e-06
## 3225199                      0.8084541 0.2856933 7.180177e-03
## 3359884                      0.9676768 0.2276376 1.198082e-04
## 3384047                      0.6420287 0.2725796 2.336936e-02
## 3600504                      1.9216245 0.5612035 1.412402e-03
## 4128270                      0.8545624 0.3860617 3.248289e-02
## 4296424                      0.6971983 0.3342814 4.327394e-02
## 4297695                      0.9268213 0.3759882 1.797814e-02
## 4302049                      0.7082492 0.3422008 4.482184e-02
## 4305837                      1.0592354 0.5027460 4.128867e-02
## 4306048                      1.0845270 0.3107363 1.168727e-03
## 4307484                      0.9331459 0.2913348 2.630740e-03
## 4309301                      1.1485033 0.2941490 3.449434e-04
## 4309764                      1.2948719 0.3764286 1.350158e-03
## 4311939                      1.5500920 0.4906645 2.969187e-03
## 4315974                      0.8492640 0.3077919 8.620200e-03
## 4318672                      0.7210365 0.2949983 1.890577e-02
## 4321559                      0.9719827 0.4615224 4.136781e-02
## 4337755                      1.4453299 0.4121670 1.114546e-03
## 4349519                      1.5107386 0.3988903 4.896826e-04
## 4353264                      1.4938954 0.3920611 4.572572e-04
## 4361528                      1.2858893 0.3858308 1.830275e-03
## 4364813                      0.7985477 0.3281459 1.939875e-02
## 4404220                      0.8296380 0.3661475 2.880624e-02
## 4418009                      1.5682956 0.3432408 4.431877e-05
## 4425214                      1.0297908 0.2504964 1.843111e-04
## 4427114                      1.4514783 0.5174584 7.658180e-03
## 4428042                      0.9148156 0.3181235 6.367975e-03
## 4439603                      0.6946130 0.2576785 1.014145e-02
## 4446902                      0.9824038 0.3226795 4.059508e-03
## 4449458                      2.7875026 0.6809617 1.944596e-04
## 4449609                      2.2084170 0.8799984 1.613113e-02
## 4453501                      2.0146149 0.3748345 3.336690e-06
## 4455767                      1.6624036 0.3885685 1.100881e-04
## 4466006                      1.3448948 0.3793517 9.969286e-04
## 4469359                      1.8910959 0.5558044 1.502378e-03
## 4471251                      1.8759443 0.5563844 1.639662e-03
## 4476950                      1.9553430 0.5083117 4.101906e-04
## 4483015                      1.7043849 0.3903989 8.387549e-05
## 494906                       0.7934991 0.3766175 4.128775e-02
## 504674                       1.9218533 0.8540771 2.986485e-02
## 544480                       1.3617864 0.4154170 2.134162e-03
## 545299                       0.6579111 0.3218337 4.738608e-02
## 64653                        2.2775105 0.5710855 2.680467e-04
## 710275                       1.1140223 0.3261624 1.447357e-03
## 711275                       1.4098026 0.3000002 2.933922e-05
## 753638                       1.0641256 0.3347341 2.811115e-03
## 823916                       0.9387880 0.3245235 6.086809e-03
## 859700                       1.3725355 0.3945559 1.208025e-03
## 863124                       1.5220274 0.4191908 7.769671e-04
## 864465                       0.6446034 0.3063618 4.154948e-02
## 886735                       0.8147830 0.4010442 4.870373e-02
## 900973                       1.2470355 0.1959480 1.312688e-07
## 907241                       2.1813456 0.6587331 1.943737e-03
## 912997                       0.9621059 0.3876269 1.725108e-02
## 913149                       0.3822520 0.1887448 4.938733e-02
## 933546                       1.1619670 0.3162489 6.842268e-04
## 963344                       2.1799632 0.5169328 1.330143e-04
## 964220                       0.9964333 0.3748705 1.115396e-02
## New.CleanUp.ReferenceOTU7806 1.4892082 0.3994054 5.829405e-04
## New.CleanUp.ReferenceOTU955  1.0747083 0.4031269 1.093489e-02
##                                                       OTU
## 1000547                                           1000547
## 1001007                                           1001007
## 1015518                                           1015518
## 103411                                             103411
## 1062356                                           1062356
## 1065569                                           1065569
## 1079708                                           1079708
## 109591                                             109591
## 1096610                                           1096610
## 130864                                             130864
## 134726                                             134726
## 137183                                             137183
## 159711                                             159711
## 161677                                             161677
## 1646259                                           1646259
## 1790396                                           1790396
## 2119418                                           2119418
## 224222                                             224222
## 245648                                             245648
## 274365                                             274365
## 2751958                                           2751958
## 28246                                               28246
## 30062                                               30062
## 3225199                                           3225199
## 3359884                                           3359884
## 3384047                                           3384047
## 3600504                                           3600504
## 4128270                                           4128270
## 4296424                                           4296424
## 4297695                                           4297695
## 4302049                                           4302049
## 4305837                                           4305837
## 4306048                                           4306048
## 4307484                                           4307484
## 4309301                                           4309301
## 4309764                                           4309764
## 4311939                                           4311939
## 4315974                                           4315974
## 4318672                                           4318672
## 4321559                                           4321559
## 4337755                                           4337755
## 4349519                                           4349519
## 4353264                                           4353264
## 4361528                                           4361528
## 4364813                                           4364813
## 4404220                                           4404220
## 4418009                                           4418009
## 4425214                                           4425214
## 4427114                                           4427114
## 4428042                                           4428042
## 4439603                                           4439603
## 4446902                                           4446902
## 4449458                                           4449458
## 4449609                                           4449609
## 4453501                                           4453501
## 4455767                                           4455767
## 4466006                                           4466006
## 4469359                                           4469359
## 4471251                                           4471251
## 4476950                                           4476950
## 4483015                                           4483015
## 494906                                             494906
## 504674                                             504674
## 544480                                             544480
## 545299                                             545299
## 64653                                               64653
## 710275                                             710275
## 711275                                             711275
## 753638                                             753638
## 823916                                             823916
## 859700                                             859700
## 863124                                             863124
## 864465                                             864465
## 886735                                             886735
## 900973                                             900973
## 907241                                             907241
## 912997                                             912997
## 913149                                             913149
## 933546                                             933546
## 963344                                             963344
## 964220                                             964220
## New.CleanUp.ReferenceOTU7806 New.CleanUp.ReferenceOTU7806
## New.CleanUp.ReferenceOTU955   New.CleanUp.ReferenceOTU955
#number of unique genera that were differentially abundant
unique(res.data.df$Genus)
##  [1] "Streptococcus"     "Staphylococcus"    "Corynebacterium"  
##  [4] "Acinetobacter"     NA                  "Finegoldia"       
##  [7] "Lactobacillus"     ""                  "Anaerococcus"     
## [10] "Paracoccus"        "Bacteroides"       "Pseudomonas"      
## [13] "Actinomyces"       "Campylobacter"     "Chryseobacterium" 
## [16] "Rothia"            "Neisseria"         "Porphyromonas"    
## [19] "Gemella"           "Actinobacillus"    "Sphingomonas"     
## [22] "Veillonella"       "Haemophilus"       "Granulicatella"   
## [25] "Peptoniphilus"     "Fusobacterium"     "Dialister"        
## [28] "Enhydrobacter"     "Alloiococcus"      "Propionibacterium"
#count the Genus assignments of those OTUs
res.data.df %>% 
  distinct(OTU, .keep_all=TRUE) %>% 
  filter(Genus != is.na(Genus)) %>% 
  filter(Genus != "") %>% 
  group_by(Genus) %>% 
  count() %>% 
  arrange(desc(n))
## # A tibble: 28 × 2
## # Groups:   Genus [28]
##    Genus               n
##    <chr>           <int>
##  1 Corynebacterium    18
##  2 Streptococcus      17
##  3 Acinetobacter       8
##  4 Anaerococcus        8
##  5 Staphylococcus      7
##  6 Enhydrobacter       3
##  7 Haemophilus         2
##  8 Lactobacillus       2
##  9 Rothia              2
## 10 Actinobacillus      1
## # ℹ 18 more rows
res.data.df %>% 
  filter(pvalue < 0.05) %>% 
  filter(Genus != "Unassigned") %>% 
  filter(Genus != "") %>% 
  mutate(Genus1 = reorder(Genus, Estimate)) %>% 
  ggplot(aes(x = Genus1, y = Estimate)) +
    geom_point(alpha = 0.5) +
    geom_errorbar(aes(ymin=Estimate-Std.Error, ymax=Estimate+Std.Error), width=.1, alpha = 0.5) +
    coord_flip() +
   labs(y = "Estimate", x = " ")+
    geom_hline(yintercept = 0, lty = 2) +
    theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(), 
       axis.text.y = element_text(size = 12)) 

ggsave("figures/Fig4.pdf")
## Saving 7 x 5 in image
res.data.df %>% 
  filter(pvalue <0.05) %>% 
  filter(Species != "Unassigned") %>% #remove any OTUs that don't have assigned species
   filter(Species != "s__") %>% 
  mutate(Species1 = reorder(Species, Estimate)) %>% 
  ggplot(aes(x = Species1, y = Estimate, color = Genus)) +
    geom_point(alpha = 0.5)+
    #geom_jitter()+
    geom_errorbar(aes(ymin=Estimate-Std.Error,
                      ymax=Estimate+Std.Error), width=.1, alpha = 0.5) +
    coord_flip() +
    geom_hline(yintercept = 0, lty = 2) +
    theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank(), 
       axis.title.y= element_blank(),
        axis.text.x = element_text(family = "Arial", size = 10),
        axis.text.y = element_text(family = "Arial", size = 10))

Random Forest with dysbiosis score

#set dysbiosis threshold and add column 
dysbiosis <- dysbiosis %>% 
  mutate(dysbiotic = case_when(CentroidDist_score > 0 ~ "Dysbiotic", 
                              CentroidDist_score < 0 ~ "Normobiosis" )) 

#add dysbiosis score and assignments to the sample data
ps_data@sam_data$score <- dysbiosis$dysbiotic

ps_data@sam_data$numericScore <- dysbiosis$CentroidDist_score

#Prep data
predict <- t(otu_table(ps_data))

#Check dimensions
dim(predict)
## [1]   67 6009
#Create response variable
res <- as.factor(sample_data(ps_data)$score)

#Combine predict variable and response variable into one dataframe
machine.data <- data.frame(res, predict)

#check dimensions
dim(machine.data)
## [1]   67 6010
# 
#class <- randomForest(res ~ ., data = machine.data, ntree = 1001, importance = TRUE, nodesize = 1)

#saveRDS(class, "output/RF") 

class <- readRDS("output/RF") #by default a random forest will be different every time you run it, so here we're loading saved RF previously made

print(class)
## 
## Call:
##  randomForest(formula = res ~ ., data = machine.data, ntree = 1001,      importance = TRUE, nodesize = 1) 
##                Type of random forest: classification
##                      Number of trees: 1001
## No. of variables tried at each split: 77
## 
##         OOB estimate of  error rate: 20.9%
## Confusion matrix:
##             Dysbiotic Normobiosis class.error
## Dysbiotic          15          11  0.42307692
## Normobiosis         3          38  0.07317073
confusiondf <- as.data.frame(class$confusion)

confusionfixed <- data.frame(Actual = c("Dysbiotic", "Dysbiotic", "Normobiosis", "Normobiosis"), Predicted = c("Dysbiotic", "Normobiosis", "Dysbiotic", "Normobiosis"), Count = c(confusiondf[1,1], confusiondf[1,2], confusiondf[2,1], confusiondf[2,2]), Freq = c(confusiondf[1,3], confusiondf[1,3], confusiondf[2,3], confusiondf[2,3]))

ggplot(confusionfixed, aes(x = Predicted, y = Actual, fill = Count)) +
  geom_tile()+
  geom_text(aes(label = Count), fontface="bold", color = "white") +
  geom_text(aes(label = Freq, vjust = 3), fontface="bold") +
  theme_bw() +
  theme(legend.position="none") +
  labs(x = "Predicted", y = "Actual")

#extract the predictors from the RF 
imp.taxa <- randomForest::importance(class)

#make a dataframe 
imp.taxa <- data.frame(predictors = rownames(imp.taxa), imp.taxa)

#rename the object for easier coding
imp <- imp.taxa

# Order the predictor levels by importance (Mean decrease accuracy)
imp.sort <- arrange(imp, desc(MeanDecreaseAccuracy))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)

# Select the top 100 predictors
imp.100 <- imp.sort[1:100, ]

# Change Column name to say "OTUID" rather than "predictors"
colnames(imp.100)[which(names(imp.100) == "predictors")] <- "OTUID"

# Remove "X" from OTUID column
imp.100$OTUID <- gsub("X", "", paste(imp.100$OTUID))

# Make Taxa table from phyloseq object a data frame
otu_df <- as.data.frame(tax_table(ps_data))

# Make OTU IDs (row names) into column
otu_df$OTUID <- rownames(otu_df)

# Merge two data frames using matched column so that we know which taxa are assigned to the OTUs
imp100.merged <- merge(imp.100, otu_df, by = "OTUID")

#rename the object
RFtop100_Otus <- imp100.merged


#count the number of unique genera
unique(RFtop100_Otus$Genus)
##  [1] "g__Streptococcus"     "g__Corynebacterium"   "g__Veillonella"      
##  [4] "g__Anaerococcus"      NA                     "g__Staphylococcus"   
##  [7] "g__Pseudomonas"       "g__Finegoldia"        "g__Peptoniphilus"    
## [10] "g__Carnobacterium"    "g__"                  "g__Actinomyces"      
## [13] "g__Acinetobacter"     "g__[Prevotella]"      "g__Blautia"          
## [16] "g__Haemophilus"       "g__Rothia"            "g__Mycobacterium"    
## [19] "g__Gemella"           "g__Actinobacillus"    "g__Micrococcus"      
## [22] "g__Aggregatibacter"   "g__Propionibacterium" "g__Granulicatella"   
## [25] "g__Megasphaera"       "g__Paracoccus"        "g__Bifidobacterium"  
## [28] "g__Dialister"         "g__Alloiococcus"
#cleanup Genus names
imp100.merged$Genus <- sub("g__", "", imp100.merged$Genus)

#plot 
imp100.merged %>% 
  mutate(Genus = reorder(Genus, MeanDecreaseAccuracy)) %>% 
  filter(Genus != "") %>% 
  filter(Genus != is.na(Genus)) %>% 
   ggplot(aes(x = reorder(Genus, MeanDecreaseAccuracy, sum), 
             y = MeanDecreaseAccuracy, 
             fill = OTUID)) +  # Stacked by OTU
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  labs(x = "Top taxa for classifying dysbiotic scalp sites",  y = "Mean Decrease Accuracy (%)")+
  theme(legend.position = "none")

ggsave("figures/Fig3.pdf", height = 5, width =6)

Compare Important Taxa

#extract diff abundant taxa from the mixed effected model
res.data <- res.data %>% filter(pvalue <0.05)

#check the intersection of OTUs between the random forest and mixed effects models
shared_sig.OTU <- intersect(res.data$OTU, RFtop100_Otus$OTU)

shared_sig.OTU #42 shared otus 
##  [1] "1000547"                      "1015518"                     
##  [3] "1062356"                      "1065569"                     
##  [5] "1079708"                      "109591"                      
##  [7] "1096610"                      "161677"                      
##  [9] "30062"                        "3225199"                     
## [11] "3384047"                      "4296424"                     
## [13] "4302049"                      "4306048"                     
## [15] "4307484"                      "4309301"                     
## [17] "4309764"                      "4311939"                     
## [19] "4315974"                      "4337755"                     
## [21] "4349519"                      "4404220"                     
## [23] "4425214"                      "4439603"                     
## [25] "4446902"                      "4453501"                     
## [27] "4455767"                      "4466006"                     
## [29] "4469359"                      "4471251"                     
## [31] "4476950"                      "4483015"                     
## [33] "494906"                       "544480"                      
## [35] "711275"                       "753638"                      
## [37] "859700"                       "863124"                      
## [39] "886735"                       "912997"                      
## [41] "New.CleanUp.ReferenceOTU7806" "New.ReferenceOTU11"
#make a new data frame keeping only OTUs sig in both models
OTU.filt.df <- ps_data.df %>% filter(OTU %in% shared_sig.OTU) 

#repeat for the relative abundance data frame 
OTU.filt_rel.df <- ps_rel.df%>% filter(OTU %in% shared_sig.OTU)
#add a variable to each set of results indicating which test they came from 

res.data$test <- "Mixed Effects Model" 

RFtop100_Otus$test <- "Random Forest Classifer"

#change the column name so they match
RFtop100_Otus$OTU <- RFtop100_Otus$OTUID

#create a shared data frame
shared <- full_join(res.data, RFtop100_Otus, by = c("OTU", "test"))

#keep only OTUs that are in both models
keep <- union(res.data$OTU, RFtop100_Otus$OTU)
shared <- shared %>%  filter(OTU %in% keep)

#make a list with the two tests seperated
data_list <- split(shared$OTU, shared$test)

#make a venn diagram
ggvenn(data_list, label_sep = "\n", fill_color = c("white", "white"), show_elements = F)

Confirm differentially abundant OTUs

#transform to log10 abundances
OTU.filt.df$log10Abund <- log10( 1 + OTU.filt.df$Abundance)

# Run Wilcoxon test for each OTU and store p-values
otu_pvals_notsig <- OTU.filt.df %>%
  group_by(OTU) %>%
  summarise(p_value = wilcox.test(log10Abund ~ Area_Cond)$p.value) %>% 
  filter(p_value > 0.05)
## Warning: There were 42 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `p_value = wilcox.test(log10Abund ~ Area_Cond)$p.value`.
## ℹ In group 1: `OTU = "1000547"`.
## Caused by warning in `wilcox.test.default()`:
## ! cannot compute exact p-value with ties
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 41 remaining warnings.
#Extract OTU names from the data frame of shared OTUs by both RF and Mixed effects model 
OTUs <- sort(unique(OTU.filt.df$OTU))

#OTUs to remove
remove <- otu_pvals_notsig$OTU

#Significant OTUs
OTUs_sig <- OTUs[!OTUs %in% remove]

#keep only significant OTUs
OTU.filt.df <- OTU.filt.df %>% 
  filter(OTU %in% OTUs_sig)

OTU.filt_rel.df <- OTU.filt_rel.df%>% 
  filter(OTU %in% OTUs_sig)

sort(unique(OTU.filt_rel.df$Genus))
## [1] ""                "Acinetobacter"   "Anaerococcus"    "Corynebacterium"
## [5] "Finegoldia"      "Gemella"         "Rothia"          "Streptococcus"
sort(unique(OTU.filt.df$Genus))
## [1] ""                "Acinetobacter"   "Anaerococcus"    "Corynebacterium"
## [5] "Finegoldia"      "Gemella"         "Rothia"          "Streptococcus"
OTU.filt_rel.df %>% 
  filter(Genus != "") %>%
  group_by(Genus) %>%  # Group by Genus before reordering
  mutate(OTU = factor(OTU, levels = unique(OTU[order(Abundance, decreasing = TRUE)]))) %>% 
  ggplot(aes(OTU, Abundance, fill = Area_Cond)) +
  geom_boxplot(alpha =0.7, outliers = F) +
  geom_point(aes(OTU, Abundance), position = position_dodge(width = 0.75), size = 0.3, alpha = 0.7) +
  scale_y_log10(breaks = scales::breaks_log(n = 10),     
                labels = scales::number_format(accuracy = 0.0001)) +
  coord_flip() +
  labs(y= "Relative Abundance") +
  scale_fill_manual(values = c("red", "black")) +
  theme(legend.title = element_blank(),
  axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5), 
  axis.text.y = element_blank())+
  facet_wrap(~Genus, ncol = 3, scales = "free_y") 
## Warning in scale_y_log10(breaks = scales::breaks_log(n = 10), labels = scales::number_format(accuracy = 1e-04)): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## Warning: Removed 680 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave("figures/Figure5.pdf")
## Saving 7 x 5 in image
## Warning in scale_y_log10(breaks = scales::breaks_log(n = 10), labels =
## scales::number_format(accuracy = 1e-04)): log-10 transformation introduced
## infinite values.
## Warning in scale_y_log10(breaks = scales::breaks_log(n = 10), labels =
## scales::number_format(accuracy = 1e-04)): log-10 transformation introduced
## infinite values.
## Warning: Removed 680 rows containing non-finite outside the scale range
## (`stat_boxplot()`).